HD OE 1Y Foxg1&Flag CUT&TAG Final

Alignment

cores=20
ref="/data1/jiawei_li/yizhi_chen/ref/GRCm39"
projPath="/data1/jiawei_li/yizhi_chen/foxg1_cut_tag/raw_data_fasta"
mkdir -p ${projPath}/alignment/sam/bowtie2_summary
mkdir -p ${projPath}/alignment/bam
mkdir -p ${projPath}/alignment/bed
mkdir -p ${projPath}/alignment/bedgraph

cat filenames
    Foxg1
    Flag
    IgG
    
cat filenames| while read histName
do
bowtie2 --local --very-sensitive --no-mixed --no-discordant --phred33 -I 10 -X 1000 -p ${cores} -x ${ref} -1 ${projPath}/${histName}_R1.fq.gz -2 ${projPath}/${histName}_R2.fq.gz -S ${projPath}/alignment/sam/${histName}_bowtie2.sam &> ${projPath}/alignment/sam/bowtie2_summary/${histName}_bowtie2.txt
done
mkdir -p $projPath/alignment/sam/fragmentLen

## Extract the 9th column from the alignment sam file which is the fragment length
cat filenames| while read histName
do
samtools view -F 0x04 $projPath/alignment/sam/${histName}_bowtie2.sam | awk -F'\t' 'function abs(x){return ((x < 0.0) ? -x : x)} {print abs($9)}' | sort | uniq -c | awk -v OFS="\t" '{print $2, $1/2}' >$projPath/alignment/sam/fragmentLen/${histName}_fragmentLen.txt
done
## R command
library(ggplot2)
library(viridis)
library(GenomicRanges)
library(stringr)
library(ggpubr)
library(magrittr)
library(dplyr)
setwd("E:/硕士/bioinformatics/Foxg1 CUT&TAG/重测序/比对")
projPath = "E:/硕士/bioinformatics/Foxg1 CUT&TAG/重测序/比对"
histList = c("Foxg1", "Flag", "IgG")
sampleList = histList
## Collect the alignment results from the bowtie2 alignment summary files
alignResult = c()
for(hist in sampleList){
  alignRes = read.table(paste0(projPath, "/", hist, "_bowtie2.txt"), header = FALSE, fill = TRUE)
  alignRate = substr(alignRes$V1[6], 1, nchar(as.character(alignRes$V1[6]))-1)
  histInfo = strsplit(hist, "_")[[1]]
  alignResult = data.frame(Histone = histInfo[1], Replicate = histInfo[2], 
                           SequencingDepth = alignRes$V1[1] %>% as.character %>% as.numeric, 
                           MappedFragNum_GRCm39 = alignRes$V1[4] %>% as.character %>% as.numeric + alignRes$V1[5] %>% as.character %>% as.numeric, 
                           AlignmentRate_GRCm39 = alignRate %>% as.numeric)  %>% rbind(alignResult, .)
}

alignResult$Histone = factor(alignResult$Histone, levels = histList)
alignResult %>% mutate(AlignmentRate_GRCm39 = paste0(AlignmentRate_GRCm39, "%"))

## Generate sequencing depth boxplot
pdf('alignment results.pdf',height = 10,width = 10)
fig1 = alignResult %>% ggplot(aes(x = Histone, y = SequencingDepth/1000000, fill = Histone)) +
    geom_boxplot() +
    geom_jitter(aes(color = Replicate), position = position_jitter(0.15)) +
    scale_fill_viridis(discrete = TRUE, begin = 0.1, end = 0.9, option = "magma", alpha = 0.8) +
    scale_color_viridis(discrete = TRUE, begin = 0.1, end = 0.9) +
    theme_bw(base_size = 18) +
    ylab("Sequencing Depth per Million") +
    xlab("") + 
    ggtitle("A. Sequencing Depth")

fig2 = alignResult %>% ggplot(aes(x = Histone, y = MappedFragNum_GRCm39/1000000, fill = Histone)) +
    geom_boxplot() +
    geom_jitter(aes(color = Replicate), position = position_jitter(0.15)) +
    scale_fill_viridis(discrete = TRUE, begin = 0.1, end = 0.9, option = "magma", alpha = 0.8) +
    scale_color_viridis(discrete = TRUE, begin = 0.1, end = 0.9) +
    theme_bw(base_size = 18) +
    ylab("Mapped Fragments per Million") +
    xlab("") +
    ggtitle("B. Alignable Fragment (GRCm39)")

fig3 = alignResult %>% ggplot(aes(x = Histone, y = AlignmentRate_GRCm39, fill = Histone)) +
    geom_boxplot() +
    geom_jitter(aes(color = Replicate), position = position_jitter(0.15)) +
    scale_fill_viridis(discrete = TRUE, begin = 0.1, end = 0.9, option = "magma", alpha = 0.8) +
    scale_color_viridis(discrete = TRUE, begin = 0.1, end = 0.9) +
    theme_bw(base_size = 18) +
    ylab("% of Mapped Fragments") +
    xlab("") +
    ggtitle("C. Alignment Rate (GRCm39)")

ggarrange(fig1, fig2, fig3, ncol = 2, nrow=2, common.legend = TRUE, legend="bottom")
dev.off()


## Collect the fragment size information
fragLen = c()
for(hist in sampleList){
  histInfo = strsplit(hist, "_")[[1]]
  fragLen = read.table(paste0(projPath, "/", hist, "_fragmentLen.txt"), header = FALSE) %>% mutate(fragLen = V1 %>% as.numeric, fragCount = V2 %>% as.numeric, Weight = as.numeric(V2)/sum(as.numeric(V2)), Histone = histInfo[1], Replicate = histInfo[2], sampleInfo = hist) %>% rbind(fragLen, .)  }

fragLen$sampleInfo = factor(fragLen$sampleInfo, levels = sampleList)
fragLen$Histone = factor(fragLen$Histone, levels = histList)
## Generate the fragment size density plot (violin plot)

pdf('fragment size distribution.pdf',height = 6,width = 12)
fig5 = fragLen %>% ggplot(aes(x = sampleInfo, y = fragLen, weight = Weight, fill = Histone)) +
  geom_violin(bw = 5) +
  scale_y_continuous(breaks = seq(0, 800, 50)) +
  scale_fill_viridis(discrete = TRUE, begin = 0.1, end = 0.9, option = "magma", alpha = 0.8) +
  scale_color_viridis(discrete = TRUE, begin = 0.1, end = 0.9) +
  theme_bw(base_size = 20) +
  ggpubr::rotate_x_text(angle = 20) +
  ylab("Fragment Length") +
  xlab("")

fig6 = fragLen %>% ggplot(aes(x = fragLen, y = fragCount, color = Histone, group = sampleInfo, linetype = sampleInfo)) +
  geom_line(linewidth = 1) +
  scale_color_viridis(discrete = TRUE, begin = 0.1, end = 0.9, option = "magma") +
  theme_bw(base_size = 20) +
  xlab("Fragment Length") +
  ylab("Count") +
  coord_cartesian(xlim = c(0, 500))

ggarrange(fig5, fig6, ncol = 2)
dev.off()

Remove duplicates

##== linux command ==##
## depending on how you load picard and your server environment, the picardCMD can be different. Adjust accordingly.
picardCMD=" java -jar /data1/jiawei_li/yizhi_chen/foxg1_cut_tag/picard/build/libs/picard.jar"
mkdir -p $projPath/alignment/removeDuplicate/picard_summary

## Sort by coordinate
cat filenames| while read histName
do
$picardCMD SortSam I=$projPath/alignment/sam/${histName}_bowtie2.sam O=$projPath/alignment/sam/${histName}_bowtie2.sorted.sam SORT_ORDER=coordinate
done

## mark duplicates
cat filenames| while read histName
do
$picardCMD MarkDuplicates I=$projPath/alignment/sam/${histName}_bowtie2.sorted.sam O=$projPath/alignment/removeDuplicate/${histName}_bowtie2.sorted.dupMarked.sam METRICS_FILE=$projPath/alignment/removeDuplicate/picard_summary/${histName}_picard.dupMark.txt
done

## remove duplicates
cat filenames| while read histName
do
$picardCMD MarkDuplicates I=$projPath/alignment/sam/${histName}_bowtie2.sorted.sam O=$projPath/alignment/removeDuplicate/${histName}_bowtie2.sorted.rmDup.sam REMOVE_DUPLICATES=true METRICS_FILE=$projPath/alignment/removeDuplicate/picard_summary/${histName}_picard.rmDup.txt
done

Visualization

##=== R command ===## 
library(ggplot2)
library(viridis)
library(GenomicRanges)
library(stringr)
library(ggpubr)
library(magrittr)
library(dplyr)
setwd("E:/硕士/bioinformatics/Foxg1 CUT&TAG/重测序/picard")
projPath = "E:/硕士/bioinformatics/Foxg1 CUT&TAG/重测序"
histList = c("Foxg1", "Flag", "IgG")
sampleList = histList
alignResult = c()
for(hist in sampleList){
  alignRes = read.table(paste0(projPath, "/比对/", hist, "_bowtie2.txt"), header = FALSE, fill = TRUE)
  alignRate = substr(alignRes$V1[6], 1, nchar(as.character(alignRes$V1[6]))-1)
  histInfo = strsplit(hist, "_")[[1]]
  alignResult = data.frame(Histone = histInfo[1], Replicate = histInfo[2], 
                           SequencingDepth = alignRes$V1[1] %>% as.character %>% as.numeric, 
                           MappedFragNum_GRCm39 = alignRes$V1[4] %>% as.character %>% as.numeric + alignRes$V1[5] %>% as.character %>% as.numeric, 
                           AlignmentRate_GRCm39 = alignRate %>% as.numeric)  %>% rbind(alignResult, .)
}

alignResult$Histone = factor(alignResult$Histone, levels = histList)
alignResult %>% mutate(AlignmentRate_GRCm39 = paste0(AlignmentRate_GRCm39, "%"))
AlignmentRate_GRCm39 = alignResult %>% mutate(AlignmentRate_GRCm39 = paste0(AlignmentRate_GRCm39, "%"))
dupResult = c()
for(hist in sampleList){
  dupRes = read.table(paste0(projPath, "/picard/", hist, "_picard.rmDup.txt"), header = TRUE, fill = TRUE)
  
  histInfo = strsplit(hist, "_")[[1]]
  dupResult = data.frame(Histone = histInfo[1], Replicate = histInfo[2], 
                         MappedFragNum_GRCm39 = dupRes$READ_PAIRS_EXAMINED[1] %>% as.character %>% as.numeric, 
                         DuplicationRate = dupRes$PERCENT_DUPLICATION[1] %>% as.character %>% as.numeric * 100, 
                         EstimatedLibrarySize = dupRes$ESTIMATED_LIBRARY_SIZE[1] %>% as.character %>% as.numeric) %>% mutate(UniqueFragNum = MappedFragNum_GRCm39 * (1-DuplicationRate/100))  %>% rbind(dupResult, .)
}
dupResult$Histone = factor(dupResult$Histone, levels = histList)
alignDupSummary = left_join(alignResult, dupResult, by = c("Histone", "Replicate", "MappedFragNum_GRCm39")) %>% mutate(DuplicationRate = paste0(DuplicationRate, "%"))
alignDupSummary

## generate boxplot figure for the  duplication rate
fig7 = dupResult %>% ggplot(aes(x = Histone, y = DuplicationRate, fill = Histone)) +
  geom_boxplot() +
  geom_jitter(aes(color = Replicate), position = position_jitter(0.15)) +
  scale_fill_viridis(discrete = TRUE, begin = 0.1, end = 0.9, option = "magma", alpha = 0.8) +
  scale_color_viridis(discrete = TRUE, begin = 0.1, end = 0.9) +
  theme_bw(base_size = 18) +
  ylab("Duplication Rate (*100%)") +
  xlab("") 

fig8 = dupResult %>% ggplot(aes(x = Histone, y = EstimatedLibrarySize, fill = Histone)) +
  geom_boxplot() +
  geom_jitter(aes(color = Replicate), position = position_jitter(0.15)) +
  scale_fill_viridis(discrete = TRUE, begin = 0.1, end = 0.9, option = "magma", alpha = 0.8) +
  scale_color_viridis(discrete = TRUE, begin = 0.1, end = 0.9) +
  theme_bw(base_size = 18) +
  ylab("Estimated Library Size") +
  xlab("") 

fig9 = dupResult %>% ggplot(aes(x = Histone, y = UniqueFragNum, fill = Histone)) +
  geom_boxplot() +
  geom_jitter(aes(color = Replicate), position = position_jitter(0.15)) +
  scale_fill_viridis(discrete = TRUE, begin = 0.1, end = 0.9, option = "magma", alpha = 0.8) +
  scale_color_viridis(discrete = TRUE, begin = 0.1, end = 0.9) +
  theme_bw(base_size = 18) +
  ylab("# of Unique Fragments") +
  xlab("")

ggarrange(fig7, fig8, fig9, ncol = 3, common.legend = TRUE, legend="bottom")

File format conversion for SEACR

Tutorial [unnecessary]

##== linux command ==##

## Build up effective chromsizes file

samtools faidx /data1/jiawei_li/yizhi_chen/foxg1_cut_tag/fasta/Mus_musculus.GRCm39.dna.toplevel.fa
cut -f1,2 Mus_musculus.GRCm39.dna.toplevel.fa.fai > GRCm39.chrom.sizes

## Filter and keep the mapped read pairs

cat filenames| while read histName
do
samtools view -bS -F 0x04 $projPath/alignment/sam/${histName}_bowtie2.sam >$projPath/alignment/bam/${histName}_bowtie2.mapped.bam
done

## Convert into bed file format

cat filenames| while read histName
do
bedtools bamtobed -i $projPath/alignment/bam/${histName}_bowtie2.mapped.bam -bedpe >$projPath/alignment/bed/${histName}_bowtie2.bed
done

## Keep the read pairs that are on the same chromosome and fragment length less than 1000bp.

cat filenames| while read histName
do
awk '$1==$4 && $6-$2 < 1000 {print $0}' $projPath/alignment/bed/${histName}_bowtie2.bed >$projPath/alignment/bed/${histName}_bowtie2.clean.bed
done

## Only extract the fragment related columns

cat filenames| while read histName
do
cut -f 1,2,6 $projPath/alignment/bed/${histName}_bowtie2.clean.bed | sort -k1,1 -k2,2n -k3,3n  >$projPath/alignment/bed/${histName}_bowtie2.fragments.bed
done


## Scale Factor

cat filenames| while read histName
do
seqDepthDouble=`samtools view -F 0x04 $projPath/alignment/sam/${histName}_bowtie2.sam|wc -l`
seqDepth=$((seqDepthDouble/2))
if [[ "$seqDepth" -gt "1" ]]; then
    
    mkdir -p $projPath/alignment/bedgraph

    scale_factor=`echo "1000000 / $seqDepth" | bc -l`   #所以RPKM就是1000000
    echo "Scaling factor for $histName is: $scale_factor!"
    bedtools genomecov -bg -scale $scale_factor -i $projPath/alignment/bed/${histName}_bowtie2.fragments.bed -g $chromSize > $projPath/alignment/bedgraph/${histName}_bowtie2.fragments.normalized.bedgraph
    
fi
done


BamCoverage To BEDGraph/Bigwig (RPGC/RPKM normalize) [recommanded]

usage: An example usage is:$ bamCoverage -b reads.bam -o coverage.bw

 *bamCoverage* offers normalization by scaling factor, Reads Per Kilobase per Million mapped reads (RPKM), counts per million (CPM), bins per
million mapped reads (BPM) and 1x depth (reads per genome coverage, RPGC).
Required arguments:
  --bam BAM file, -b BAM file
                        BAM file to process (default: None)
Output:
  --outFileName FILENAME, -o FILENAME
                        Output file name. (default: None)
  --outFileFormat {bigwig,bedgraph}, -of {bigwig,bedgraph}
                        Output file type. Either "bigwig" or "bedgraph". (default: bigwig)
Optional arguments:
  --help, -h            show this help message and exit
  --scaleFactor SCALEFACTOR
                        The computed scaling factor (or 1, if not applicable) will be multiplied by this. (Default: 1.0)
  --MNase               Determine nucleosome positions from MNase-seq data. Only 3 nucleotides at the center of each fragment are counted. The fragment ends are
                        defined by the two mate reads. Only fragment lengthsbetween 130 - 200 bp are considered to avoid dinucleosomes or other artifacts. By default,
                        any fragments smaller or larger than this are ignored. To over-ride this, use the --minFragmentLength and --maxFragmentLength options, which
                        will default to 130 and 200 if not otherwise specified in the presence of --MNase. *NOTE*: Requires paired-end data. A bin size of 1 is
                        recommended. (default: False)
  --Offset INT [INT ...]
                        Uses this offset inside of each read as the signal. This is useful in cases like RiboSeq or GROseq, where the signal is 12, 15 or 0 bases past
                        the start of the read. This can be paired with the --filterRNAstrand option. Note that negative values indicate offsets from the end of each
                        read. A value of 1 indicates the first base of the alignment (taking alignment orientation into account). Likewise, a value of -1 is the last
                        base of the alignment. An offset of 0 is not permitted. If two values are specified, then they will be used to specify a range of positions.
                        Note that specifying something like --Offset 5 -1 will result in the 5th through last position being used, which is equivalent to trimming 4
                        bases from the 5-prime end of alignments. Note that if you specify --centerReads, the centering will be performed before the offset. (default:
                        None)
  --filterRNAstrand {forward,reverse}
                        Selects RNA-seq reads (single-end or paired-end) originating from genes on the given strand. This option assumes a standard dUTP-based library
                        preparation (that is, --filterRNAstrand=forward keeps minus-strand reads, which originally came from genes on the forward strand using a dUTP-
                        based method). Consider using --samExcludeFlag instead for filtering by strand in other contexts. (default: None)
  --version             show program's version number and exit
  --binSize INT bp, -bs INT bp
                        Size of the bins, in bases, for the output of the bigwig/bedgraph file. (Default: 50)
  --region CHR:START:END, -r CHR:START:END
                        Region of the genome to limit the operation to - this is useful when testing parameters to reduce the computing time. The format is
                        chr:start:end, for example --region chr10 or --region chr10:456700:891000. (default: None)
  --blackListFileName BED file [BED file ...], -bl BED file [BED file ...]
                        A BED or GTF file containing regions that should be excluded from all analyses. Currently this works by rejecting genomic chunks that happen to
                        overlap an entry. Consequently, for BAM files, if a read partially overlaps a blacklisted region or a fragment spans over it, then the
                        read/fragment might still be considered. Please note that you should adjust the effective genome size, if relevant. (default: None)
  --numberOfProcessors INT, -p INT
                        Number of processors to use. Type "max/2" to use half the maximum number of processors or "max" to use all available processors. (Default: 1)
  --verbose, -v         Set to see processing messages. (default: False)

Read coverage normalization options:
  --effectiveGenomeSize EFFECTIVEGENOMESIZE
                        The effective genome size is the portion of the genome that is mappable. Large fractions of the genome are stretches of NNNN that should be
                        discarded. Also, if repetitive regions were not included in the mapping of reads, the effective genome size needs to be adjusted accordingly. A
                        table of values is available here: http://deeptools.readthedocs.io/en/latest/content/feature/effectiveGenomeSize.html . (default: None)
  --normalizeUsing {RPKM,CPM,BPM,RPGC,None}
                        Use one of the entered methods to normalize the number of reads per bin. By default,no normalization is performed. 
                        RPKM = Reads Per Kilobase per Million mapped reads; 
                        CPM = Counts Per Million mapped reads, same as CPM in RNA-seq; 
                        BPM = Bins Per Million mapped reads, same as TPM in RNA-seq; 
                        RPGC = reads per genomic content (1x normalization); Mapped reads are considered after blacklist filtering (if applied). 
                        RPKM (per bin) = number of reads per bin / (number of mapped reads (in millions) * bin length (kb)). 
                        CPM (per bin) = number of reads per bin / number of mapped reads (in millions). 
                        BPM (per bin) = number of reads per bin / sum of all reads per bin (in millions). RPGC (per bin) = number of reads per bin / scaling factor for 1x average coverage. None = the default and equivalent to not setting this option at all. This scaling factor, in turn, is determined from the sequencing depth: (total number of mapped reads * fragment length) / effective genome size. 
                        The scaling factor used is the inverse of the sequencing depth computed for the sample to match the 1x coverage. This option requires --effectiveGenomeSize. Each read is considered independently, if you want to only count one mate from a pair in paired-end data, then use the --samFlagInclude/--samFlagExclude options. (Default: None)
  
  --exactScaling        Instead of computing scaling factors based on a sampling of the reads, process all of the reads to determine the exact number that will be used
                        in the output. This requires significantly more time to compute, but will produce more accurate scaling factors in cases where alignments that
                        are being filtered are rare and lumped together. In other words, this is only needed when region-based sampling is expected to produce
                        incorrect results. (default: False)
  --ignoreForNormalization IGNOREFORNORMALIZATION [IGNOREFORNORMALIZATION ...], -ignore IGNOREFORNORMALIZATION [IGNOREFORNORMALIZATION ...]
                        A list of space-delimited chromosome names containing those chromosomes that should be excluded for computing the normalization. This is useful when considering samples with unequal coverage across chromosomes, like male samples. An usage examples is --ignoreForNormalization chrX chrM.
                        (default: None)
  --skipNonCoveredRegions, --skipNAs
                        This parameter determines if non-covered regions (regions without overlapping reads) in a BAM file should be skipped. The default is to treat
                        those regions as having a value of zero. The decision to skip non-covered regions depends on the interpretation of the data. Non-covered
                        regions may represent, for example, repetitive regions that should be skipped. (default: False)
  --smoothLength INT bp
                        The smooth length defines a window, larger than the binSize, to average the number of reads. For example, if the --binSize is set to 20 and the
                        --smoothLength is set to 60, then, for each bin, the average of the bin and its left and right neighbors is considered. Any value smaller than
                        --binSize will be ignored and no smoothing will be applied. (default: None)

Read processing options:
  --extendReads [INT bp], -e [INT bp]
                        This parameter allows the extension of reads to fragment size. If set, each read is extended, without exception. *NOTE*: This feature is
                        generally NOT recommended for spliced-read data, such as RNA-seq, as it would extend reads over skipped regions. *Single-end*: Requires a user
                        specified value for the final fragment length. Reads that already exceed this fragment length will not be extended. *Paired-end*: Reads with
                        mates are always extended to match the fragment size defined by the two read mates. Unmated reads, mate reads that map too far apart (>4x
                        fragment length) or even map to different chromosomes are treated like single-end reads. The input of a fragment length value is optional. If
                        no value is specified, it is estimated from the data (mean of the fragment size of all mate reads). (default: False)
  --ignoreDuplicates    If set, reads that have the same orientation and start position will be considered only once. If reads are paired, the mate's position also has
                        to coincide to ignore a read. (default: False)
  --minMappingQuality INT
                        If set, only reads that have a mapping quality score of at least this are considered. (default: None)
  --centerReads         By adding this option, reads are centered with respect to the fragment length. For paired-end data, the read is centered at the fragment length
                        defined by the two ends of the fragment. For single-end data, the given fragment length is used. This option is useful to get a sharper signal
                        around enriched regions. (default: False)
  --samFlagInclude INT  Include reads based on the SAM flag. For example, to get only reads that are the first mate, use a flag of 64. This is useful to count properly
                        paired reads only once, as otherwise the second mate will be also considered for the coverage. (Default: None)
  --samFlagExclude INT  Exclude reads based on the SAM flag. For example, to get only reads that map to the forward strand, use --samFlagExclude 16, where 16 is the
                        SAM flag for reads that map to the reverse strand. (Default: None)
  --minFragmentLength INT
                        The minimum fragment length needed for read/pair inclusion. This option is primarily useful in ATACseq experiments, for filtering mono- or di-
                        nucleosome fragments. (Default: 0)
  --maxFragmentLength INT
                        The maximum fragment length needed for read/pair inclusion. (Default: 0)
##== linux command ==##
mkdir -p $projPath/alignment/bigwig                                                                                                                                        

cat filenames| while read histName
do
samtools sort -o $projPath/alignment/bam/${histName}.sorted.bam $projPath/alignment/bam/${histName}_bowtie2.mapped.bam  
done

cat filenames| while read histName
do
samtools index $projPath/alignment/bam/${histName}.sorted.bam
done


chromSize="/data1/jiawei_li/yizhi_chen/foxg1_cut_tag/raw_data_fasta/GRCm39.chrom.sizes"

# If you had run the command with --outFileFormat bedgraph, you could easily peak into the resulting file.
cat filenames| while read histName
do
bamCoverage -b ${projPath}/alignment/bam/${histName}.sorted.bam -o $projPath/alignment/bam/${histName}_normalized.bw \
    --binSize 10 \
    --normalizeUsing RPGC \
    --effectiveGenomeSize 3388488650 \
    --extendReads \
    --outFileFormat bedgraph
done

# Bigwig
cat filenames| while read histName
do
bamCoverage -b ${projPath}/alignment/bam/${histName}.sorted.bam -o $projPath/alignment/bam/${histName}_normalized.bw \
    --binSize 10 \
    --normalizeUsing RPGC \
    --effectiveGenomeSize 3388488650 \
    --extendReads 
done


  • 最长片段880,不需要clean

remove dup file format convert

##== linux command ==##

cat filenames| while read histName
do
samtools view -bS -F 0x04 $projPath/alignment/removeDuplicate/${histName}_bowtie2.sorted.rmDup.sam >$projPath/alignment/bam/${histName}_rmDup.bam
done

mkdir -p $projPath/alignment/bigwig                                                                                                                                        

cat filenames| while read histName
do
samtools sort -o $projPath/alignment/bam/${histName}_rmDup.sorted.bam $projPath/alignment/bam/${histName}_rmDup.bam  
samtools index $projPath/alignment/bam/${histName}_rmDup.sorted.bam
done


chromSize="/data1/jiawei_li/yizhi_chen/foxg1_cut_tag/raw_data_fasta/GRCm39.chrom.sizes"

# If you had run the command with --outFileFormat bedgraph, you could easily peak into the resulting file.
cat filenames| while read histName
do
bamCoverage -b ${projPath}/alignment/bam/${histName}_rmDup.sorted.bam -o $projPath/alignment/bam/${histName}_rmDup_normalized.bw \
    --binSize 10 \
    --normalizeUsing RPGC \
    --effectiveGenomeSize 3388488650 \
    --extendReads \
    --outFileFormat bedgraph
#bigwig
bamCoverage -b ${projPath}/alignment/bam/${histName}_rmDup.sorted.bam -o $projPath/alignment/bam/${histName}_rmDup_normalized.bigwig \
    --binSize 10 \
    --normalizeUsing RPGC \
    --effectiveGenomeSize 3388488650 \
    --extendReads 
done


SEACR call peaks(不推荐于做低丰度蛋白peak call工具)

tutorial

cd $projPath/alignment/bedgraph

cat treatnames  | while read treat
do
bash $projPath/SEACR_1.3.sh ${treat}_bowtie2.fragments.normalized.bedgraph IgG_bowtie2.fragments.normalized.bedgraph non stringent ${treat}_SEACR
done

remove dup

  • stringent
cat treatnames  | while read treat
do
bash $projPath/SEACR_1.3.sh ${treat}_rmDup_normalized.bw IgG_rmDup_normalized.bw norm stringent ${treat}_rmDup
done
  • relax
cat treatnames  | while read treat
do
bash $projPath/SEACR_1.3.sh ${treat}_rmDup_normalized.bw IgG_rmDup_normalized.bw non relax ${treat}_SEACR
done

from BamCoverage


cd $projPath/alignment/bam

cat treatnames  | while read treat
do
bash $projPath/SEACR_1.3.sh ${treat}_normalized.bw IgG_normalized.bw non stringent ${treat}_SEACR
done

...

MACS2 call peaks

## --shift -100 --extsize 200 --nomodel -q 0.05

cat treatnames  | while read treat
do
macs2 callpeak -t ${treat}.sorted.bam -f BAMPE -g mm -q 0.05 -B --SPMR -n ${treat} --outdir ./MACS2 2>${treat}.macs2.log
done

## --shift -100 --extsize 200 --nomodel -p 0.05

cat treatnames  | while read treat
do
macs2 callpeak -t ${treat}.sorted.bam -f BAMPE -g mm -p 0.05 -B --SPMR -n ${treat} --outdir ./MACS2 2>${treat}.macs2.log
done

q0.05

p0.05

p0.01

## --shift -100 --extsize 200 --nomodel -c

cat treatnames  | while read treat
do
macs2 callpeak -t ${treat}.sorted.bam -c IgG.sorted.bam -f BAMPE -g mm -p 0.05 -B --SPMR -n ${treat} --outdir ./MACS2 2>${treat}.macs2.log
done

q 0.05

p0.05

GOPeaks

chromSize="/data1/jiawei_li/yizhi_chen/foxg1_cut_tag/raw_data_fasta/GRCm39.chrom.sizes"
cat filenames| while read histName
do
gopeaks -b ${histName}.sorted.bam -c IgG.sorted.bam -s $chromSize -o ./$histName
done

remove duplicate

chromSize="/data1/jiawei_li/yizhi_chen/foxg1_cut_tag/raw_data_fasta/GRCm39.chrom.sizes"
cat filenames| while read histName
do
gopeaks -b ${histName}_rmDup.sorted.bam -c IgG_rmDup.sorted.bam -s $chromSize -o ./$histName
done

CHIPSeeker

Galaxy

R

##== R command ==##

library("ChIPseeker")
library("org.Mm.eg.db")
library("GenomicFeatures")
#Mus_musculus.GRCm39.109.gtf在bioinformatic/GRCm39 ref文件夹中
txdb <- makeTxDbFromGFF("E:/硕士/bioinformatics/GRCm39 ref/Mus_musculus.GRCm39.109.gtf")
library("clusterProfiler")
library("ggplot2")
library("ggimage")
library("UpSetR")
library("ggpubr")
setwd=("E:/硕士/bioinformatics/Foxg1 CUT&TAG/重测序/peak")
Foxg1 <- readPeakFile("Foxg1_peaks.narrowPeak")
Flag <- readPeakFile("Flag_peaks.narrowPeak")
Foxg1
#GRanges object with 39471 ranges and 0 metadata columns:
#seqnames            ranges strand
#<Rle>         <IRanges>  <Rle>
#[1]        1   3165101-3165400      *
#[2]        1   3443001-3443400      *
#[3]        1   3462101-3462450      *
# [4]        1   3465301-3466050      *
# [5]        1   3470351-3470750      *
# ...      ...               ...    ...
#[39467]        Y 90810551-90812100      *
#[39468]        Y 90814801-90815450      *
#[39469]        Y 90818601-90820150      *
#[39470]        Y 90821901-90822350      *
#[39471]        Y 90851801-90853350      *
#-------
#seqinfo: 23 sequences from an unspecified genome; no seqlengths

# 制作多个样本比较的list
peaks <- list(FOXG1=Foxg1,FLAG=Flag)
# promotor区间范围可以自己设定
promoter <- getPromoters(TxDb=txdb, upstream=3000, downstream=3000)
tagMatrixList <- lapply(peaks, getTagMatrix, windows=promoter)
plotAvgProf(tagMatrixList, xlim=c(-3000, 3000))
plotAvgProf(tagMatrixList, xlim=c(-3000, 3000), conf=0.95,resample=500, facet="row")
plotPeakProf2(peaks, upstream = 3000, downstream = 3000, conf = 0.95,
              by = "gene", type = "start_site", TxDb = txdb,# by = 'gene', 'transcript', 'exon' or features of interest(e.g. "enhancer")
              facet = "row", nbin = 800)
plotPeakProf2(peaks, upstream = rel(0.2), downstream = rel(0.2),
              conf = 0.95, by = "gene", type = "body",
              TxDb = txdb, facet = "row", nbin = 800)
#tagHeatmap(tagMatrixList)


# annotatePeak传入annoDb参数,可进行基因ID转换(Entrez,ENSEMBL,SYMBOL,GENENAME)
peakAnnoList <- lapply(peaks, annotatePeak, TxDb=txdb,tssRegion=c(-3000, 3000), verbose=FALSE)#annoDb="org.Mm.eg.db"

write.table(as.data.frame(peakAnnoList$FOXG1), file="FOXG1_peakAnno.xls", quote = F, row.names = F, sep = "\t")
write.table(as.data.frame(peakAnnoList$FLAG), file="FLAG_peakAnno.xls", quote = F, row.names = F, sep = "\t")

## 可视化
plotAnnoBar(peakAnnoList)
plotDistToTSS(peakAnnoList,title="Distribution of transcription factor-binding loci \n relative to TSS")

# 提取peakAnnolist中的基因,结合clusterProfiler包对peaks内的邻近基因进行富集注释。
# Create a list with genes from each sample
gene = lapply(peakAnnoList, function(i) as.data.frame(i)$geneId)
vennplot(gene)

# Run GO enrichment analysis 
# list用compareCluster函数做富集分析
ego <- compareCluster(geneCluster = gene,
                      fun = "enrichGO",
                      keyType = "ENSEMBL", 
                      OrgDb = org.Mm.eg.db, 
                      ont = "ALL",  #One of "BP", "MF", and "CC" subontologies, or "ALL" for all three.
                      pAdjustMethod = "BH",
                      pvalueCutoff = 0.05,
                      qvalueCutoff = 0.05, 
                      readable = TRUE)
ego_BP <- compareCluster(geneCluster = gene,
                      fun = "enrichGO",
                      keyType = "ENSEMBL", 
                      OrgDb = org.Mm.eg.db, 
                      ont = "BP",  #One of "BP", "MF", and "CC" subontologies, or "ALL" for all three.
                      pAdjustMethod = "BH",
                      pvalueCutoff = 0.05,
                      qvalueCutoff = 0.05, 
                      readable = TRUE)
ego_MF <- compareCluster(geneCluster = gene,
                      fun = "enrichGO",
                      keyType = "ENSEMBL", 
                      OrgDb = org.Mm.eg.db, 
                      ont = "MF",  #One of "BP", "MF", and "CC" subontologies, or "ALL" for all three.
                      pAdjustMethod = "BH",
                      pvalueCutoff = 0.05,
                      qvalueCutoff = 0.05, 
                      readable = TRUE)
ego_CC <- compareCluster(geneCluster = gene,
                      fun = "enrichGO",
                      keyType = "ENSEMBL", 
                      OrgDb = org.Mm.eg.db, 
                      ont = "CC",  #One of "BP", "MF", and "CC" subontologies, or "ALL" for all three.
                      pAdjustMethod = "BH",
                      pvalueCutoff = 0.05,
                      qvalueCutoff = 0.05, 
                      readable = TRUE)

# Dotplot visualization
BP=dotplot(ego_BP,
        includeAll=FALSE,
        showCategory=15,
        font.size=10,
        title = "GO:BP Pathway Enrichment Analysis")
MF=dotplot(ego_MF,
           includeAll=FALSE,
           showCategory=15,
           font.size=10,
           title = "GO:MF Pathway Enrichment Analysis")
CC=dotplot(ego_CC,
           includeAll=FALSE,
           showCategory=15,
           font.size=10,
           title = "GO:CC Pathway Enrichment Analysis")
pdf(file = "GO.pdf" , height = 10 ,width = 20 )
ggarrange(BP,MF,CC ,nrow = 1 ,align = "h")
dev.off()

# 也可以分开来做
ego1 <- enrichGO(gene = gene$FOXG1, 
                 keyType = "ENSEMBL", 
                 OrgDb = org.Mm.eg.db, 
                 ont = "BP", 
                 pAdjustMethod = "BH",
                 pvalueCutoff = 0.05,
                 qvalueCutoff = 0.05, 
                 readable = TRUE)
ego2 <- enrichGO(gene = gene$FLAG, 
                 keyType = "ENSEMBL", 
                 OrgDb = org.Mm.eg.db, 
                 ont = "BP", 
                 pAdjustMethod = "BH",
                 pvalueCutoff = 0.05,
                 qvalueCutoff = 0.05, 
                 readable = TRUE)


# Multiple samples KEGG analysis
##提取KEGG背景
library(msigdbr)
msigdbr_species()
mouse_KEGG = msigdbr(species = "Mus musculus",
                     category = "C2", 
                     subcategory = "CP:KEGG") %>% dplyr::select(gs_name,ensembl_gene)


compKEGG <- compareCluster(geneCluster = gene, 
                           fun = "enricher",
                           TERM2GENE = mouse_KEGG,
                           pvalueCutoff  = 0.05, 
                           pAdjustMethod = "BH")
KEGG=dotplot(compKEGG, showCategory = 20, title = "KEGG Pathway Enrichment Analysis")
pdf(file = "KEGG.pdf" , height = 10 ,width = 10 )
KEGG
dev.off()
pdf(file = "Total enrich.pdf",height = 22,width = 22)
ggarrange(BP,MF,CC,KEGG ,ncol = 2 ,nrow = 2 ,align = "hv")
dev.off()
pdf(file = "Total enrich2.pdf",height = 12,width = 25)
ggarrange(BP,MF,CC,KEGG ,nrow = 1 )
dev.off()


##covplot

covplot_Foxg1=covplot(Foxg1, title = "Foxg1 CUT&TAG Peaks over Chromosomes",
                      weightCol="V5",chrs=c("1", "2", "3", "4", 
                                            "5", "6", "7", 
                                            "8", "9", "10",
                                            "11", "12", "13",
                                            "14", "15", "16",
                                            "17", "18", "19"))
covplot_Flag=covplot(Flag, title = "Flag CUT&TAG Peaks over Chromosomes",
                     weightCol="V5",chrs=c("1", "2", "3", "4", 
                                           "5", "6", "7", 
                                           "8", "9", "10",
                                           "11", "12", "13",
                                           "14", "15", "16",
                                           "17", "18", "19"))
pdf(file = "covplot.pdf",height = 10,width = 20)
ggarrange(covplot_Foxg1,covplot_Flag,nrow = 1)
dev.off()





Deeptools


cores=20

##total                       
computeMatrix scale-regions -S $projPath/alignment/bam/Foxg1_normalized.bw \
                               $projPath/alignment/bam/Flag_normalized.bw \
                               -R $projPath/Mus_musculus.GRCm39.109.bed12 \
                              --beforeRegionStartLength 2000 \
                              --regionBodyLength 5000 \
                              --afterRegionStartLength 2000 \
                              -o $projPath/TOTAL_TSSTES_matrix_gene_at_genome.mat.gz \
                              -p $cores --skipZeros
                              

plotHeatmap -m $projPath/TOTAL_TSSTES_matrix_gene_at_genome.mat.gz \
     -out TOTAL_TSSTESHeatmap.pdf  \
     --colorMap OrRd \
     --whatToShow 'plot, heatmap and colorbar' \
     --heatmapHeight 17 \
     --legendLocation best

##TSS
#--referencePoint TSS -b 5000 -a 5000
computeMatrix reference-point -p $cores \
                              --referencePoint TSS \
                              -b 5000 -a 5000 \
                              -R $projPath/Mus_musculus.GRCm39.109.bed12 \
                              -S $projPath/alignment/bigwig/Foxg1_raw.bw \
                              --skipZeros  \
                              -out $projPath/Foxg1_TSS_matrix_gene_at_genome.mat.gz

plotHeatmap -m $projPath/Foxg1_TSS_matrix_gene_at_genome.mat.gz \
     -out Foxg1TSS.peakHeatmap.pdf \
     --outFileSortedRegions  $projPath/alignment/clusterbed/FOXG1_TSS_cluster_regions.bed \
     --colorMap OrRd \
     --whatToShow 'plot, heatmap and colorbar' \
     --kmeans 5   \
     --heatmapHeight 15 \
     --legendLocation best
     
##peak
###foxg1
computeMatrix reference-point -p $cores \
                              --referencePoint center \
                              -a 3000 -b 3000 \
                              -R $projPath/alignment/MACS2/Foxg1_peaks.narrowPeak \
                              -S $projPath/alignment/bam/Foxg1_normalized.bw \
                              --skipZeros  \
                              -out $projPath/FOXG1peak_matrix_gene.mat.gz
                              
plotHeatmap -m $projPath/FOXG1peak_matrix_gene.mat.gz \
     -out FOXG1.peakHeatmap.pdf \
     --outFileSortedRegions  $projPath/alignment/clusterbed/FOXG1_cluster_regions.bed \
     --colorMap OrRd \
     --whatToShow 'plot, heatmap and colorbar' \
     --kmeans 5   \
     --heatmapHeight 15 \
     --legendLocation best
     
###flag
computeMatrix reference-point -p $cores \
                              --referencePoint center \
                              -b 3000 -a 3000 \
                              -R $projPath/alignment/MACS2/Flag_peaks.narrowPeak \
                              -S $projPath/alignment/bam/Flag_normalized.bw \
                              --skipZeros  \
                              -out $projPath/FLAGpeak_matrix_gene.mat.gz
                              
plotHeatmap -m $projPath/FLAGpeak_matrix_gene.mat.gz \
     -out FLAG.peakHeatmap.pdf \
     --colorMap OrRd \
     --whatToShow 'plot, heatmap and colorbar' \
     --heatmapHeight 15 \
     --legendLocation best


gimmemotifs

total

gimme motifs Foxg1_peaks.narrowPeak FOXG1.motif -g ~/.local/share/genomes/Mus_musculus.GRCm39.dna.toplevel -p HOMER --known
gimme motifs Flag_peaks.narrowPeak FLAG.motif -g ~/.local/share/genomes/Mus_musculus.GRCm39.dna.toplevel -p HOMER --known

cluster

gimme maelstrom -p HOMER for_motif_FOXG1_peakcenter ~/.local/share/genomes/Mus_musculus.GRCm39.dna.toplevel ./cluster_motif_res_Homer
gimme maelstrom -p JASPAR2020_vertebrates for_motif_FOXG1_peakcenter ~/.local/share/genomes/Mus_musculus.GRCm39.dna.toplevel ./motif_res_JASPAR_FOXG1_peakcenter
from gimmemotifs.maelstrom import MaelstromResult
import matplotlib.pyplot as plt
mr = MaelstromResult("/data1/jiawei_li/yizhi_chen/foxg1_cut_tag/raw_data_fasta/alignment/clusterbed/motif_res")
mr.plot_heatmap(threshold=1,figsize=(6,8))
plt.savefig("/data1/jiawei_li/yizhi_chen/foxg1_cut_tag/raw_data_fasta/alignment/clusterbed/motif_res_Homer_FOXG1_peakcenter/FOXG1_231201_motif_heatmap_Homer.pdf")
homer_res

jaspar

#23.12.1
#/data1/jiawei_li/yizhi_chen/foxg1_cut_tag/raw_data_fasta/alignment/clusterbed/cluster_motif_res_Homer
from gimmemotifs.maelstrom import MaelstromResult
import matplotlib.pyplot as plt
mr = MaelstromResult("/data1/jiawei_li/yizhi_chen/foxg1_cut_tag/raw_data_fasta/alignment/clusterbed/cluster_motif_res_Homer")
mr.plot_heatmap(threshold=1,figsize=(5,10))
plt.savefig("/data1/jiawei_li/yizhi_chen/foxg1_cut_tag/raw_data_fasta/alignment/clusterbed/cluster_motif_res_Homer/FOXG1_2308_motif_heatmap_Homer.pdf")

ChipSeeker

library("ChIPseeker")
library("org.Mm.eg.db")
library("GenomicFeatures")
library("clusterProfiler")
library("ggplot2")
library("ggimage")
library("UpSetR")

setwd("E:/硕士/bioinformatics/Foxg1 CUT&TAG/重测序/cluster")

txdb <- makeTxDbFromGFF("E:/硕士/bioinformatics/GRCm39 ref/Mus_musculus.GRCm39.109.gtf")
cluster1 <- readPeakFile("c1.bed")
cluster2 <- readPeakFile("c2.bed")
cluster3 <- readPeakFile("c3.bed")
cluster4 <- readPeakFile("c4.bed")
cluster5 <- readPeakFile("c5.bed")
peaks <- list(cluster1=cluster1,
              cluster2=cluster2,
              cluster3=cluster3,
              cluster4=cluster4,
              cluster5=cluster5)
promoter <- getPromoters(TxDb=txdb, upstream=3000, downstream=3000)
tagMatrixList <- lapply(peaks, getTagMatrix, windows=promoter)
tagHeatmap(tagMatrixList)

peakAnnoList <- lapply(peaks, annotatePeak,
                       TxDb=txdb,tssRegion=c(-3000, 3000), 
                       verbose=FALSE)
plotAnnoBar(peakAnnoList,xlab="", ylab='Percentage(%)')
plotDistToTSS(peakAnnoList,title="Distribution of transcription factor-binding loci \n relative to TSS")
gene = lapply(peakAnnoList, function(i) as.data.frame(i)$geneId)
vennplot(gene)

# Run GO enrichment analysis 
ego <- compareCluster(geneCluster = gene,
                      fun = "enrichGO",
                      keyType = "ENSEMBL", 
                      OrgDb = org.Mm.eg.db, 
                      ont = "BP", 
                      pAdjustMethod = "BH",
                      pvalueCutoff = 0.05,
                      qvalueCutoff = 0.05, 
                      readable = TRUE)

# Dotplot visualization
A = dotplot(ego,
            includeAll=FALSE,
            showCategory=10,
            font.size=10,
            title = "GO:BP Pathway Enrichment Analysis")
#KEGG
library(msigdbr)
msigdbr_species()
mouse_KEGG = msigdbr(species = "Mus musculus",
                     category = "C2", 
                     subcategory = "CP:KEGG") %>% dplyr::select(gs_name,ensembl_gene)


compKEGG <- compareCluster(geneCluster = gene, 
                           fun = "enricher",
                           TERM2GENE = mouse_KEGG,
                           pvalueCutoff  = 0.05, 
                           pAdjustMethod = "BH")
B = dotplot(compKEGG, showCategory = 10, 
            #color = "pvalue",
            title = "KEGG Pathway Enrichment Analysis" ,shape = T)
C=A+B
C
pdf(file = "GO + KEGG.pdf" , width = 20,height = 15)
dev.off()

write.table(compKEGG ,file = "KEGG.xls" , sep = "\t" ,quote = F ,row.names = F )
write.table(ego ,file = "GO.xls" , sep = "\t" ,quote = F ,row.names = F )




bedVis

library(transPlotR)
library(rtracklayer)
library(aplot)
library(grid)
library(ggplotify)
bdfile <- list.files(pattern = '.bed')
bdfile
bedVis(bdFile = bdfile,
       chr = "7",
       track.width = 0.3,
       show.legend = T)

gtf <- BiocIO::import('E:/硕士/bioinformatics/GRCm39 ref/Mus_musculus.GRCm39.109.gtf',format = "gtf") %>% data.frame() 

#loadBigWig
file <- list.files(pattern = '.bw')
mybw <- loadBigWig(file)
# check
head(mybw,3)
# color aes by transcript
p <-
  trancriptVis(gtfFile = gtf,
               gene = "Ube3a",
               relTextDist = -0.5,
               exonWidth = 0.5,
               exonFill = 'black',
               arrowCol = 'black',
               textLabelSize = 3,
               addNormalArrow = FALSE,
               newStyleArrow = TRUE,
               xAxis.info = FALSE,
               textLabel = 'transcript_name',
               circle = F,collapse = T) +
  xlab('')
# track + gene + peak
Ube3a <-
  trackVis(bWData = mybw,
           gtf.file = gtf,
           gene.name = "Ube3a",
           extend.up = 3000,
           extend.dn = 3000,
           xAxis.info = F,
           theme = "bw",
           yAxis.info = F,new.yaxis = T,new.label = T,
           pos.ratio = c(0.06,0.8),
           color = jjAnno::useMyCol(platte = "stallion",n = 3))

# peak
bd1 <- bedVis(bdFile = bdfile,
             chr = "7",
             track.width = 0.5,
             show.legend = F,
             fill=jjAnno::useMyCol(platte = "stallion",n = 2))

# combine
c1 <- Ube3a %>% insert_bottom(p,height = 0.30) %>%
  insert_bottom(bd1,height = 0.10)
c1
c1 <- as.grob(c1)
ggsave("Ube3a.pdf")

# color aes by transcript
p2 <-
  trancriptVis(gtfFile = gtf,
               gene = "Ube3c",
               relTextDist = -0.5,
               exonWidth = 0.5,
               exonFill = 'black',
               arrowCol = 'black',
               textLabelSize = 3,
               addNormalArrow = FALSE,
               newStyleArrow = TRUE,
               xAxis.info = FALSE,
               textLabel = 'transcript_name',
               circle = F,collapse = T) +
  xlab('')
# track + gene + peak
Ube3c <-
  trackVis(bWData = mybw,
           gtf.file = gtf,
           gene.name = "Ube3c",
           extend.up = 3000,
           extend.dn = 3000,
           xAxis.info = F,
           theme = "bw",
           yAxis.info = F,new.yaxis = T,new.label = T,
           pos.ratio = c(0.06,0.8),
           color = jjAnno::useMyCol(platte = "stallion",n = 3))

# peak
bd2 <- bedVis(bdFile = bdfile,
             chr = "5",
             track.width = 0.3,
             show.legend = F,
             fill=jjAnno::useMyCol(platte = "stallion",n = 2))

# combine
c2 <- Ube3c %>% insert_bottom(p2,height = 0.3) %>%
  insert_bottom(bd2,height = 0.1)  
c2
c2 <- as.grob(c2)
ggsave("Ube3c.pdf")

# combine
library(gridExtra)
grid.arrange(c1, c2, ncol=2)
ggsave("Ube3c.pdf")


Violin Plot

library("ChIPseeker")
library("org.Mm.eg.db")
library("GenomicFeatures")
library("clusterProfiler")
library("ggplot2")
library("ggimage")
library("UpSetR")
library("ggGenshin")
library("GeneOverlap")
library("ggVennDiagram")
library("VennDiagram")

txdb <- makeTxDbFromGFF("E:/硕士/bioinformatics/GRCm39 ref/Mus_musculus.GRCm39.109.gtf")
cluster1 <- readPeakFile("c1.bed")
cluster2 <- readPeakFile("c2.bed")
cluster3 <- readPeakFile("c3.bed")
cluster4 <- readPeakFile("c4.bed")
cluster5 <- readPeakFile("c5.bed")
peaks <- list(cluster1=cluster1,
              cluster2=cluster2,
              cluster3=cluster3,
              cluster4=cluster4,
              cluster5=cluster5)
promoter <- getPromoters(TxDb=txdb, upstream=3000, downstream=3000)
tagMatrixList <- lapply(peaks, getTagMatrix, windows=promoter)
tagHeatmap(tagMatrixList)

peakAnnoList <- lapply(peaks, annotatePeak,
                       TxDb=txdb,tssRegion=c(-3000, 3000), 
                       verbose=FALSE)
plotAnnoBar(peakAnnoList,xlab="", ylab='Percentage(%)')
plotDistToTSS(peakAnnoList,title="Distribution of transcription factor-binding loci \n relative to TSS")
gene = lapply(peakAnnoList, function(i) as.data.frame(i)$geneId)

peak <- readPeakFile("FOXG1_cluster_regions.xls")
peakanno <- annotatePeak(peak,
                         TxDb = txdb,
                         tssRegion=c(-3000, 3000),
                         verbose=FALSE)
write.csv(peakanno,"total_peak_anno.csv")
peakanno <- as.data.frame(peakanno)
OE_vs_HD <- read.table("OE_vs_HD_result_data_anno.xls",sep = "\t",header = T)
merge <- merge(peakanno,OE_vs_HD,by = "geneId")

# filter by LFC and padj
merge1 <- merge[(abs(merge$log2FoldChange)< 7),]
merge_filt<- merge[(abs(merge$log2FoldChange)>= 0.5 & abs(merge$log2FoldChange)< 7 &
                                 (merge$pvalue<= 0.05)),] 
merge_filt<- as.data.frame(merge_filt)
merge_filt <- merge_filt[!is.na(merge_filt$cluster_1), ]
# Violin plot of DEG distribution in each cluster LFC.cutoff=0.5 for dots
#my_palette <- brewer.pal(name="Blues",n=8)[3:8]
ggGenshin::keys()
my_palette <- pal_xiao(alpha = 1)(5)
p_Foxg1 <- ggplot(merge1, 
                  aes(x=cluster_1, y=log2FoldChange, fill=cluster_1, color= cluster_1, alpha=0.5, font=12))+ 
  scale_color_manual(values = my_palette, aesthetics = "fill")+
  scale_color_manual(values = my_palette, aesthetics = "colour")+
  geom_violin()+ 
  labs(x="cluster", y = "Log2FC")+ theme_light()+
  stat_summary(fun=median, geom="point", size=2, color="black") +
  theme(axis.text.x = element_text(size=12,angle =60),
        axis.text.y = element_text(size=12),
        axis.title = element_text(size=12))

violin_p_Foxg1<- p_Foxg1 + geom_jitter( data= merge_filt, 
                                           shape=16, 
                                           size=2.5,alpha = 0.5,
                                           position=position_jitter(width=0.2, height= 0.2))+ 
  stat_summary(fun=median, geom="point", size=2, color="black")
violin_p_Foxg1
ggsave("violin_plot_OE_vs_HD.pdf")


##KO vs WT
KO_vs_WT <- read.table("KO_vs_WT_result_data_anno.xls",sep = "\t",header = T)
mergeKO <- merge(peakanno,KO_vs_WT,by = "geneId")

# filter by LFC and padj
mergeKO1 <- mergeKO[(abs(mergeKO$log2FoldChange)< 7),]
mergeKO_filt<- mergeKO[(abs(mergeKO$log2FoldChange)>= 0.5 & abs(mergeKO$log2FoldChange)< 7 &
                      (mergeKO$padj<= 0.05)),] 
mergeKO_filt<- as.data.frame(mergeKO_filt)
mergeKO_filt <- mergeKO_filt[!is.na(mergeKO_filt$cluster_1), ]
# Violin plot of DEG distribution in each cluster LFC.cutoff=0.5 for dots
#my_palette <- brewer.pal(name="Blues",n=8)[3:8]
ggGenshin::keys()
my_palette <- pal_nahida(alpha = 1)(5)
p_Foxg1_2 <- ggplot(mergeKO1, 
                  aes(x=cluster_1, y=log2FoldChange, fill=cluster_1, color= cluster_1, alpha=0.5, font=12))+ 
  scale_color_manual(values = my_palette, aesthetics = "fill")+
  scale_color_manual(values = my_palette, aesthetics = "colour")+
  geom_violin()+ 
  labs(x="cluster", y = "Log2FC")+ theme_light()+
  stat_summary(fun=median, geom="point", size=2, color="black") +
  theme(axis.text.x = element_text(size=12,angle =60),
        axis.text.y = element_text(size=12),
        axis.title = element_text(size=12))
p_Foxg1_2
violin_p_Foxg1_2<- p_Foxg1_2 + geom_jitter( data= mergeKO_filt, 
                                        shape=16, 
                                        size=2.5,alpha = 0.5,
                                        position=position_jitter(width=0.2, height= 0.2))+ 
  stat_summary(fun=median, geom="point", size=2, color="black")
violin_p_Foxg1_2


###correlation
# read in K-means clustered peaks files (from the heatmap above) 
cluster1 <- peakanno[(peakanno$cluster_1 == "cluster_1"),]
cluster2 <- peakanno[(peakanno$cluster_1 == "cluster_2"),]
cluster3 <- peakanno[(peakanno$cluster_1 == "cluster_3"),]
cluster4 <- peakanno[(peakanno$cluster_1 == "cluster_4"),]
cluster5 <- peakanno[(peakanno$cluster_1 == "cluster_5"),]
# h3k27ac cluster list with annotations (gain, loss, static)
foxg1_list<- list(static_foxg1= cluster1$geneId,
                  loss1_foxg1= cluster2$geneId,
                  loss2_foxg1= cluster3$geneId,
                  gain_foxg1= cluster4$geneId,
                  static_no_foxg1= cluster5$geneId)

# Upload DEG table
OE_vs_HD
KO_vs_WT
# filter according to increased and decreased gene expression
increased_DEG_OE<-OE_vs_HD[(OE_vs_HD$log2FoldChange>=0.5  &
                                   OE_vs_HD$pvalue<=0.05),]
decreased_DEG_OE<-OE_vs_HD[(OE_vs_HD$log2FoldChange<= -0.5 &
                                   OE_vs_HD$pvalue<=0.05),]
static_DEG_OE<-OE_vs_HD[(abs(OE_vs_HD$log2FoldChange)<=0.5 & OE_vs_HD$pvalue>0.05),]
# combine filtered DEGs in a list
DEG_list_OE<- list(increased_DEG= increased_DEG_OE$geneId,
                decreased_DEG=decreased_DEG_OE$geneId,
                static=static_DEG_OE$geneId)
# Prepare Geneoverlap matrix for enrichment test- clustered H3K27ac-DEG overlap
GO_matrix_foxg1OE<-newGOM(foxg1_list, DEG_list_OE, genome.size = 24500)

# oddsratio heatmap for strength of association between lists
heatmap_OE<- drawHeatmap(GO_matrix_foxg1OE, 
                            what = c("odds.ratio"), 
                            adj.p=T, 
                            cutoff=0.05, 
                            ncolused = 5,
                            grid.col = "Blues",
                            note.col = "Black")

##KO
# filter according to increased and decreased gene expression
increased_DEG_KO<-KO_vs_WT[(KO_vs_WT$log2FoldChange>=0.5  &
                              KO_vs_WT$padj<=0.05),]
decreased_DEG_KO<-KO_vs_WT[(KO_vs_WT$log2FoldChange<= -0.5 &
                              KO_vs_WT$padj<=0.05),]
static_DEG_KO<-KO_vs_WT[(abs(KO_vs_WT$log2FoldChange)<=0.5 & KO_vs_WT$padj>0.05),]
# combine filtered DEGs in a list
DEG_list_KO<- list(increased_DEG= increased_DEG_KO$geneId,
                   decreased_DEG=decreased_DEG_KO$geneId,
                   static=static_DEG_KO$geneId)
# Prepare Geneoverlap matrix for enrichment test- clustered H3K27ac-DEG overlap
GO_matrix_foxg1KO<-newGOM(foxg1_list, DEG_list_KO, genome.size = 24500)
GO_matrix_foxg1KO
#> GO_matrix_foxg1KO
#A <5 x 3> GeneOverlapMatrix object
#Geneset A sizes:
#  static_foxg1     loss1_foxg1     loss2_foxg1      gain_foxg1 static_no_foxg1 
#1790            9775           11413           29431           30564 
#Geneset B sizes:
#  increased_DEG decreased_DEG        static 
#1239          1655         12491 

# oddsratio heatmap for strength of association between lists
heatmap_KO<- drawHeatmap(GO_matrix_foxg1KO, 
                         what = c("odds.ratio"),
                         log.scale = F,
                         adj.p=T, 
                         cutoff=0.05, 
                         ncolused = 5,
                         grid.col = "Blues",
                         note.col = "Black")


Foxg1_KD_DEGs<-read.table("DE_genes_shrinked_apeglm_DIV11.tabular", 
                          sep="\t", header = TRUE,)
Foxg1_KD_DEGs_df<- as.data.frame(Foxg1_KD_DEGs)
Foxg1_KD_DEGs_df$log2FoldChange<-as.numeric(gsub(",", ".", Foxg1_KD_DEGs_df$log2FoldChange))
nrow(Foxg1_KD_DEGs_df)
nrow(Foxg1_KD_DEGs)

OE vs HD

KO vs WT